{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import (absolute_import, division,\n",
    "                        print_function, unicode_literals)\n",
    "from builtins import *\n",
    "from io import open"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from NoisyNLP.utils import *\n",
    "from NoisyNLP.features import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocab: 8023, cat_names: set([u'product', u'facility', u'movie', u'company', u'sportsteam', u'musicartist', u'person', u'other', u'geo-loc', u'tvshow'])\n",
      "Train: 2394, Dev: 1420, Test: 3856\n"
     ]
    }
   ],
   "source": [
    "train_sequences = load_sequences(\"./data/cleaned/train.BIEOU.tsv\", sep=\"\\t\", notypes=False)\n",
    "dev_sequences = (load_sequences(\"./data/cleaned/dev.BIEOU.tsv\", sep=\"\\t\", notypes=False) \n",
    "                 + load_sequences(\"./data/cleaned/dev_2015.BIEOU.tsv\", sep=\"\\t\", notypes=False))\n",
    "test_sequences = load_sequences(\"./data/cleaned/test.BIEOU.tsv\", sep=\"\\t\", notypes=False)\n",
    "vocab = load_vocab(\"./vocab.no_extras.txt\")\n",
    "cat_names = get_cat_names(train_sequences)\n",
    "print(\"Vocab: %s, cat_names: %s\\nTrain: %s, Dev: %s, Test: %s\" % (len(vocab), cat_names, len(train_sequences), len(dev_sequences), len(test_sequences)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "all_sentences = [[t[0] for t in seq] for seq in (train_sequences+dev_sequences)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "results = \"\"\"processed 23050 tokens with 937 phrases; found: 748 phrases; correct: 343.\n",
    "accuracy:  94.34%; precision:  45.86%; recall:  36.61%; FB1:  40.71\n",
    "          company: precision:  46.94%; recall:  31.94%; FB1:  38.02  49\n",
    "         facility: precision:  10.81%; recall:   8.70%; FB1:   9.64  37\n",
    "          geo-loc: precision:  45.45%; recall:  58.64%; FB1:  51.21  209\n",
    "            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  8\n",
    "      musicartist: precision:  28.57%; recall:   3.51%; FB1:   6.25  7\n",
    "            other: precision:  26.45%; recall:  17.88%; FB1:  21.33  121\n",
    "           person: precision:  61.39%; recall:  65.16%; FB1:  63.22  259\n",
    "          product: precision:  24.14%; recall:  15.22%; FB1:  18.67  29\n",
    "       sportsteam: precision:  77.78%; recall:  20.00%; FB1:  31.82  27\n",
    "           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  2\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{u'overall_accuracy': 94.34, u'processed_tokens': u'23050', u'prfs':        category  precision  recall     F1  support\n",
      "0       overall      45.86   36.61  40.71      0.0\n",
      "1       company      46.94   31.94  38.02     49.0\n",
      "2      facility      10.81    8.70   9.64     37.0\n",
      "3       geo-loc      45.45   58.64  51.21    209.0\n",
      "4         movie       0.00    0.00   0.00      8.0\n",
      "5   musicartist      28.57    3.51   6.25      7.0\n",
      "6         other      26.45   17.88  21.33    121.0\n",
      "7        person      61.39   65.16  63.22    259.0\n",
      "8       product      24.14   15.22  18.67     29.0\n",
      "9    sportsteam      77.78   20.00  31.82     27.0\n",
      "10       tvshow       0.00    0.00   0.00      2.0, u'found': u'748', u'total': u'937', u'correct': u'343'}\n"
     ]
    }
   ],
   "source": [
    "r = parse_results(results)\n",
    "print(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>precision</th>\n",
       "      <th>recall</th>\n",
       "      <th>F1</th>\n",
       "      <th>support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>overall</td>\n",
       "      <td>45.86</td>\n",
       "      <td>36.61</td>\n",
       "      <td>40.71</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>company</td>\n",
       "      <td>46.94</td>\n",
       "      <td>31.94</td>\n",
       "      <td>38.02</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>facility</td>\n",
       "      <td>10.81</td>\n",
       "      <td>8.70</td>\n",
       "      <td>9.64</td>\n",
       "      <td>37.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>geo-loc</td>\n",
       "      <td>45.45</td>\n",
       "      <td>58.64</td>\n",
       "      <td>51.21</td>\n",
       "      <td>209.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>movie</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>musicartist</td>\n",
       "      <td>28.57</td>\n",
       "      <td>3.51</td>\n",
       "      <td>6.25</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>other</td>\n",
       "      <td>26.45</td>\n",
       "      <td>17.88</td>\n",
       "      <td>21.33</td>\n",
       "      <td>121.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>person</td>\n",
       "      <td>61.39</td>\n",
       "      <td>65.16</td>\n",
       "      <td>63.22</td>\n",
       "      <td>259.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>product</td>\n",
       "      <td>24.14</td>\n",
       "      <td>15.22</td>\n",
       "      <td>18.67</td>\n",
       "      <td>29.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>sportsteam</td>\n",
       "      <td>77.78</td>\n",
       "      <td>20.00</td>\n",
       "      <td>31.82</td>\n",
       "      <td>27.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>tvshow</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       category  precision  recall     F1  support\n",
       "0       overall      45.86   36.61  40.71      0.0\n",
       "1       company      46.94   31.94  38.02     49.0\n",
       "2      facility      10.81    8.70   9.64     37.0\n",
       "3       geo-loc      45.45   58.64  51.21    209.0\n",
       "4         movie       0.00    0.00   0.00      8.0\n",
       "5   musicartist      28.57    3.51   6.25      7.0\n",
       "6         other      26.45   17.88  21.33    121.0\n",
       "7        person      61.39   65.16  63.22    259.0\n",
       "8       product      24.14   15.22  18.67     29.0\n",
       "9    sportsteam      77.78   20.00  31.82     27.0\n",
       "10       tvshow       0.00    0.00   0.00      2.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r[\"prfs\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running:\n",
      "cat \"./test_wv.crf.bieou.tsv\" | tr '\\t' ' ' | perl -ne '{chomp;s/\\r//g;print $_,\"\\n\";}' | python NoisyNLP/conlleval.py\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{u'correct': '839',\n",
       " u'found': '1661',\n",
       " u'overall_accuracy': 91.84,\n",
       " u'prfs':        category  precision  recall     F1  support\n",
       " 0       overall      50.51   24.05  32.58      0.0\n",
       " 1       company      63.01   14.79  23.96    146.0\n",
       " 2      facility      43.33   15.42  22.74     90.0\n",
       " 3       geo-loc      66.38   52.71  58.76    702.0\n",
       " 4         movie       0.00    0.00   0.00      9.0\n",
       " 5   musicartist      20.00    1.04   1.98     10.0\n",
       " 6         other      29.14    8.63  13.32    175.0\n",
       " 7        person      40.44   33.88  36.87    408.0\n",
       " 8       product      18.31    5.28   8.20     71.0\n",
       " 9    sportsteam      24.39    6.80  10.64     41.0\n",
       " 10       tvshow      11.11    3.03   4.76      9.0,\n",
       " u'processed_tokens': '61908',\n",
       " u'total': '3489'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_conll_eval(\"./test_wv.crf.bieou.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialized RegexFeature\n"
     ]
    }
   ],
   "source": [
    "regex_features = RegexFeatures()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'containsDigit': True, u'isAlphaNumeric': True}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regex_features.process(\"ABC123\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "read dict cap.1000\n"
     ]
    }
   ],
   "source": [
    "dict_features = DictionaryFeatures(\"./data/test/lexicons\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'DICT=cap.1000']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dict_features.GetDictFeatures([\"new\", \"york\", \"is\", \"a\", \"great\", \"place\", \"in\", \"america\"], 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'DICT_HASHTAG=cap.1000']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dict_features.GetHashtagDictFeatures(\"#york\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sequences = [(\"the\", \"man\", \"crossed\", \"the\", \"road\", \"in\", \"new\", \"york\"),\n",
    "             (\"new\", \"york\", \"is\", \"a\", \"great\", \"place\", \"in\", \"america\")\n",
    "            ]\n",
    "\n",
    "word2vec = WordVectors(sequences, \"./data/test/glove.twitter.200d.txt.processed.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(word2vec.model.vocab.keys()) == set(sum(sequences, ()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'a': 2,\n",
       " u'america': 0,\n",
       " u'crossed': 1,\n",
       " u'great': 1,\n",
       " u'in': 1,\n",
       " u'is': 1,\n",
       " u'man': 2,\n",
       " u'new': 1,\n",
       " u'place': 0,\n",
       " u'road': 0,\n",
       " u'the': 1,\n",
       " u'york': 1}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2vec.get_clusters(n_clusters=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.01950729, -0.0318368 , -0.03900977, ..., -0.04715571,\n",
       "         0.06079158, -0.01819291],\n",
       "       [ 0.0069145 , -0.00876959,  0.06516815, ...,  0.00377096,\n",
       "         0.0426897 , -0.05388267],\n",
       "       [ 0.03771846,  0.02122468, -0.05407947, ..., -0.04112153,\n",
       "         0.03916675, -0.00868494],\n",
       "       ..., \n",
       "       [ 0.01300056, -0.03810946,  0.0111307 , ..., -0.07383171,\n",
       "         0.05161401,  0.01348415],\n",
       "       [ 0.06153985,  0.03178791,  0.03036125, ...,  0.08684369,\n",
       "        -0.0501973 ,  0.00344352],\n",
       "       [ 0.02337532,  0.10477841, -0.08604416, ...,  0.05988278,\n",
       "        -0.013699  ,  0.08674569]], dtype=float32)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2vec.model.syn0norm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cf = ClusterFeatures(\"data/test/brown_clusters/\", cluster_type=\"brown\")\n",
    "cf.set_cluster_file_path(\"data/test/brown_clusters/paths\")\n",
    "clusters = cf.read_clusters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'15mins': u'00000',\n",
       " u'Burrows': u'00000',\n",
       " u'Carolina': u'00000',\n",
       " u'Mythological': u'00000',\n",
       " u'day-to-day': u'00000',\n",
       " u'elbows': u'00000',\n",
       " u'rearview': u'00000',\n",
       " u'sec': u'00000',\n",
       " u'sexiness': u'00000',\n",
       " u'walls': u'00000'}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cf = ClusterFeatures(\"data/test/clark_clusters/\", cluster_type=\"clark\")\n",
    "cf.set_cluster_file_path(\"data/test/clark_clusters/clark_clusters.32.txt\")\n",
    "clusters = cf.read_clusters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'!': (u'1', 0.183724),\n",
       " u\"'s\": (u'30', 0.303898),\n",
       " u',': (u'27', 0.553379),\n",
       " u'.': (u'1', 0.439693),\n",
       " u'...': (u'1', 0.121445),\n",
       " u':': (u'13', 0.655373),\n",
       " u'I': (u'5', 0.404412),\n",
       " u'RT': (u'22', 0.613343),\n",
       " u'a': (u'14', 0.273296),\n",
       " u'and': (u'27', 0.249616),\n",
       " u'at': (u'2', 0.115745),\n",
       " u'for': (u'2', 0.146101),\n",
       " u'in': (u'2', 0.156466),\n",
       " u'it': (u'10', 0.224207),\n",
       " u'of': (u'2', 0.127345),\n",
       " u'on': (u'2', 0.136476),\n",
       " u'the': (u'14', 0.359631),\n",
       " u'to': (u'28', 0.925121),\n",
       " u'you': (u'19', 0.452206)}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
